#Dennis Moskov, Master Thesis
#classification tree (CT) with k-fold cross-validation (CV)
#outcomes: conversion, selectivity and yield of MeOH
#folds are built by article number

#using "rpart" package
#install.packages("rpart")
#library(rpart)

#packages for fancy plot
#fancy tree plot
#install.packages("rattle")
#install.packages("rpart.plot")
#library(rattle)
#library(rpart.plot)

#randomly shuffle the rows of the data set DB (loaded beforehand; column 1
#holds the article number, the last three columns hold the outcomes)
set.seed(77)                      # seed for reproducibility
DBt<-DB[sample(nrow(DB)),]
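#optional structure check (a sketch; assumes the outcome columns carry the
#names listed in 'results' below -- adapt if the data is organised differently)
#stopifnot(is.data.frame(DBt), all(c("X.MeOH","S.MeOH","Y.MeOH") %in% names(DBt)))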

#matrix of outcomes: row 1 = labels, row 2 = column names in DBt,
#row 3 = column indices (the outcomes are the last three columns)
results<-rbind(c("Conversion","Selectivity","Yield"),
               c("X.MeOH","S.MeOH","Y.MeOH"),
               c(ncol(DBt)-2,ncol(DBt)-1,ncol(DBt)))
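
#'results' is a 3 x 3 character matrix (rbind coerces the indices to character,
#hence the as.numeric() calls wherever row 3 is used):
#     [,1]         [,2]          [,3]
#[1,] "Conversion" "Selectivity" "Yield"
#[2,] "X.MeOH"     "S.MeOH"      "Y.MeOH"
#[3,] "ncol-2"     "ncol-1"      "ncol"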

#loop over the three outcomes
for (r in 1:3) {

#keep the article number, the predictors and the desired outcome; drop the two
#unused outcome columns (column 1 is overwritten with fold labels further down)
useDB<-DBt[-as.numeric(results[3,-r])]


#list collecting the test-set results across all folds; the observed classes
#are appended per fold so that they stay aligned with the predicted classes
matte<-list(number=numeric(0), fold=numeric(0),
            predicted=character(0), observed=character(0))

#folds are built from unique article numbers so that all rows of one article
#stay in the same fold (no article appears in both training and test data)
ref.num.CV <- unique(DBt[,1])

#choose number of folds
k <- 5	

#build folds from blocks of unique article numbers; fold label j is written
#into column 1 of useDB for every row of an article assigned to fold j
for(j in 1:k){
  from <- trunc(length(ref.num.CV)/k*(j-1)+1)
  to <- if(j<k) trunc(length(ref.num.CV)/k*j) else length(ref.num.CV)
  change.index <- ref.num.CV[from:to]
  useDB[DBt[,1] %in% change.index, 1] <- j
}
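
#optional check (sketch): every row should now carry a fold label in 1..k
#stopifnot(all(useDB[,1] %in% 1:k))
#table(useDB[,1])          #rows per fold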

#list of the variables used for tree construction in each fold
tvar<-vector("list", k)

#list of the variable importance values of each fold
vi<-vector("list", k)


#perform k-fold cross-validation
for(i in 1:k){
    #segment data by fold
    testIndexes <- which(useDB[,1]==i)
    testData <- useDB[testIndexes, ]               #test data of fold i
    testData <- testData[-1]                       #remove fold column
    trainData <- useDB[-testIndexes, ]             #training data of fold i
    trainData <- trainData[-1]                     #remove fold column
 
    #grow tree: the last column is the class, all remaining columns are
    #predictors; minbucket keeps at least 2 % of all observations in each leaf
    form <- paste(names(trainData)[length(trainData)], "~", paste(names(trainData)[-length(trainData)], collapse=" + "))
    fit<-rpart(as.formula(form), data=trainData, method="class",
               control=rpart.control(minbucket=round(nrow(useDB)*0.02)))
    
    #prune the tree at the complexity parameter with the lowest cross-validated error
    pfit<-prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
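    #rpart's cptable has the columns CP, nsplit, rel error, xerror and xstd;
    #the pruning trade-off can be inspected with (commented out):
    #printcp(fit)   #tabulated complexity parameters
    #plotcp(fit)    #cross-validated error versus cp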
 
    #predict and save most probable classes of testData
    predte <- predict(object=pfit,newdata=testData,type="class")

    #append the results of this fold; observed classes are taken from testData
    #so that predicted and observed entries stay aligned
    matte$number<-c(matte$number,as.numeric(names(predte)))
    matte$fold<-c(matte$fold,rep(i,length(predte)))
    matte$predicted<-c(matte$predicted,as.character(unname(predte)))
    matte$observed<-c(matte$observed,as.character(testData[,length(testData)]))

    #save the variables used by the unpruned tree of this fold
    tvar[[i]]<-setdiff(as.character(fit$frame$var),"<leaf>")

    #save the variable importance of the pruned tree of this fold
    vi[[i]]<-pfit$variable.importance
}

#confusion matrix over all test folds; both vectors are turned into factors with
#identical levels so that the table stays square even if a class is never predicted
lev<-sort(unique(c(matte$predicted,matte$observed)))
confute<-table(factor(matte$predicted,levels=lev),factor(matte$observed,levels=lev))

#misclassification error for the test data
misste<-(sum(confute)-sum(diag(confute)))/sum(confute)
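
#optional per-class measures (sketch): with rows = predicted and columns =
#observed classes, recall is diag/colSums and precision is diag/rowSums
#recall<-diag(confute)/colSums(confute)
#precision<-diag(confute)/rowSums(confute)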


#pad the per-fold variable lists with NA and bind them into one matrix
max.len <- max(sapply(tvar, length))
corrected.list <- lapply(tvar, function(x) {c(x, rep(NA, max.len - length(x)))})
tvar <- do.call(cbind, corrected.list)
colnames(tvar) <- paste("fold", 1:k)
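
#tvar is now a max.len x k character matrix, e.g. (hypothetical variable names):
#     fold 1  fold 2  ...
#[1,] "Temp"  "Temp"
#[2,] "Pres"  NA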


#plot the classification tree of the last CV fold on screen
x11()
fancyRpartPlot(pfit,main=paste("Decision Tree for Classification of MeOH",results[1,r]),sub=paste("Misclassification Error for Predicted Classes: ",round(misste,digits=4)))

#export the confusion matrix, the variables used per fold and the fold-wise
#variable importance (unlist() keeps the variable names)
write.csv(confute, file=paste(results[1,r],"confusionPred.csv"))
write.csv(tvar, file=paste(results[1,r],"Variables_Tree_Construction.csv"))
vi.all<-unlist(vi)
write.csv(cbind(variable=names(vi.all),importance=vi.all), file=paste(results[1,r],"variable_importance.csv"))
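
#note: rpart's variable.importance sums the goodness-of-split measures of each
#variable over all splits in which it appears (surrogates included), see ?rpart.object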

#export the decision rules of the pruned tree (rattle::asRules)
capture.output(asRules(pfit), file=paste(results[1,r],"Rules.txt"))

#save the classification tree of the last fold to a PDF file
pdf(paste(results[1,r],"Tree.pdf"))
fancyRpartPlot(pfit,main=paste("Decision Tree for Classification of MeOH",results[1,r]),sub=paste("Misclassification Error for Predicted Classes: ",round(misste,digits=4)))
dev.off()

}
